rm(list = ls(all = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  554925 29.7    1242271 66.4   686457 36.7
## Vcells 1016538  7.8    8388608 64.0  1876640 14.4
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)


`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

1 Ath gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')

2 Ath SKM & annotation

note: some duplicated ids in PSS

fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12', 
               'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]


fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
  # Remove NA and empty strings
  x = x[!is.na(x) & x != ""]
  paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]


fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]

pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
  pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long = melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
pss_long %>%
  dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
  dplyr::arrange(id) %>%
  print()
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
pss_long = pss_long %>%
  dplyr::filter(stringr::str_starts(id, "AT")) %>%
  dplyr::group_by(id) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::everything(),
      .fns = ~ {
        vals = unique(na.omit(.))
        if (length(vals) > 1) paste(vals, collapse = " | ")
        else if (length(vals) == 1) vals
        else NA_character_
      }
    ),
    .groups = "drop"
  )

pss_long[pss_long == ""] = "-"
gn[is.na(gn)] = "-"
sn[is.na(sn)] = "-"

ath.annot = pss_long %>%
  dplyr::full_join(gn, by = c("id" = "V1")) %>%
  dplyr::full_join(sn, by = c("id" = "V1"))
group = "others"

Note: be careful with 35.2 bin matches

params_list <- list(
  plantName = 'stu'
  , plantNameOut = "potato"
  , plantDirOut = file.path('..', 'reports', group, "potato")
  , flag = 1
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000021ba2e1b540>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-11] | |…… | 11% | |…….. | 15% [unnamed-chunk-12] | |……… | 19% | |……….. | 22% [unnamed-chunk-13] | |…………. | 26% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-15] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-16] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-17] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-18] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-19] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-20] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-21] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-22] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-23] | |……………………………………………| 100%

cat(child_content)

3 Subsection: stu

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

3.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  15035 75961 17547 51397 23243 33885
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID            to_plant source 
##    <chr>       <chr>      <chr>                <chr>    <chr>  
##  1 AT2G37600   ath        PGSC0003DMG400000168 stu      FastOMA
##  2 AT3G53740   ath        PGSC0003DMG400000168 stu      FastOMA
##  3 AT3G23880   ath        Sotub12g029980       stu      FastOMA
##  4 AT2G13770   ath        Sotub12g032820       stu      FastOMA
##  5 AT1G01220   ath        Soltu.DM.01G035280   stu      MCScanX
##  6 AT1G01225   ath        Soltu.DM.01G035270   stu      MCScanX
##  7 ATMG01080   ath        PGSC0003DMG400002202 stu      MCScanX
##  8 ATMG01110   ath        Soltu.DM.12G017350   stu      MCScanX
##  9 AT1G56560   ath        Soltu.DM.01G018690   stu      OrthoDB
## 10 AT3G06500   ath        Soltu.DM.01G018690   stu      OrthoDB
## 11 AT5G47320   ath        Sotub01g015810       stu      OrthoDB
## 12 AT2G28815   ath        Soltu.DM.01G010300   stu      OrthoDB
## 13 AT1G61070   ath        Soltu.DM.01G000020   stu      PLAZA  
## 14 AT2G02100   ath        Soltu.DM.01G000020   stu      PLAZA  
## 15 AT4G28140   ath        PGSC0003DMG400041562 stu      PLAZA  
## 16 AT3G15353   ath        PGSC0003DMG400015318 stu      PLAZA  
## 17 AT1G01030   ath        Soltu.DM.08G005020   stu      RBH    
## 18 AT1G01030   ath        Soltu.DM.08G005030   stu      RBH    
## 19 ATMG01360   ath        Sotub03g026800       stu      RBH    
## 20 ATMG01360   ath        Sotub07g016160       stu      RBH    
## 21 AT5G01020   ath        Soltu.DM.10G018260   stu      compara
## 22 AT5G01030   ath        Soltu.DM.09G001170   stu      compara
## 23 AT1G80950   ath        Soltu.DM.04G034970   stu      compara
## 24 AT1G80980   ath        Soltu.DM.03G031250   stu      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1962885 104.9    3294447 176.0  3126594 167.0
## Vcells 19373211 147.9   31696991 241.9 31627982 241.4

3.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23082
length(unique(dt$to_geneID))
## [1] 30489
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   15035   75961   17547   51397   23243   33885
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

3.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

3.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

3.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

3.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
##  FALSE   TRUE 
## 115350    428
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "PGSC0003DMG400002971" "PGSC0003DMG401011339" "PGSC0003DMG401011339"
##   [4] "Soltu.DM.S000060"     "Soltu.DM.S000060"     "Soltu.DM.S000060"    
##   [7] "Soltu.DM.S000060"     "Soltu.DM.S000060"     "Soltu.DM.S000060"    
##  [10] "Soltu.DM.S000060"     "Soltu.DM.S000070"     "Soltu.DM.S000070"    
##  [13] "Soltu.DM.S000100"     "Soltu.DM.S000110"     "Soltu.DM.S000120"    
##  [16] "Soltu.DM.S000120"     "Soltu.DM.S000140"     "Soltu.DM.S000140"    
##  [19] "Soltu.DM.S000160"     "Soltu.DM.S000160"     "Soltu.DM.S000170"    
##  [22] "Soltu.DM.S000170"     "Soltu.DM.S000180"     "Soltu.DM.S000210"    
##  [25] "Soltu.DM.S000210"     "Soltu.DM.S000240"     "Soltu.DM.S000240"    
##  [28] "Soltu.DM.S000240"     "Soltu.DM.S000270"     "Soltu.DM.S000280"    
##  [31] "Soltu.DM.S000290"     "Soltu.DM.S000290"     "Soltu.DM.S000320"    
##  [34] "Soltu.DM.S000330"     "Soltu.DM.S000360"     "Soltu.DM.S000360"    
##  [37] "Soltu.DM.S000420"     "Soltu.DM.S000420"     "Soltu.DM.S000430"    
##  [40] "Soltu.DM.S000430"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [43] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [46] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [49] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [52] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000520"    
##  [55] "Soltu.DM.S000520"     "Soltu.DM.S000780"     "Soltu.DM.S000780"    
##  [58] "Soltu.DM.S000830"     "Soltu.DM.S000840"     "Soltu.DM.S000850"    
##  [61] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000850"    
##  [64] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000850"    
##  [67] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000850"    
##  [70] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000870"    
##  [73] "Soltu.DM.S000890"     "Soltu.DM.S000920"     "Soltu.DM.S000950"    
##  [76] "Soltu.DM.S000980"     "Soltu.DM.S000990"     "Soltu.DM.S000990"    
##  [79] "Soltu.DM.S001000"     "Soltu.DM.S001000"     "Soltu.DM.S001000"    
##  [82] "Soltu.DM.S001000"     "Soltu.DM.S001100"     "Soltu.DM.S001120"    
##  [85] "Soltu.DM.S001120"     "Soltu.DM.S001120"     "Soltu.DM.S001120"    
##  [88] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
##  [91] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
##  [94] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
##  [97] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
## [100] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
## [103] "Soltu.DM.S001270"     "Soltu.DM.S001270"     "Soltu.DM.S001270"    
## [106] "Soltu.DM.S001280"     "Soltu.DM.S001300"     "Soltu.DM.S001340"    
## [109] "Soltu.DM.S001340"     "Soltu.DM.S001350"     "Soltu.DM.S001470"    
## [112] "Soltu.DM.S001470"     "Soltu.DM.S001520"     "Soltu.DM.S001520"    
## [115] "Soltu.DM.S001520"     "Soltu.DM.S001520"     "Soltu.DM.S001540"    
## [118] "Soltu.DM.S001540"     "Soltu.DM.S001600"     "Soltu.DM.S001650"    
## [121] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [124] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [127] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [130] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [133] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [136] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [139] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [142] "Soltu.DM.S001650"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [145] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [148] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [151] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [154] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [157] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [160] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [163] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [166] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [169] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [172] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [175] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [178] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [181] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [184] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [187] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001680"    
## [190] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [193] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [196] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [199] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [202] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [205] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [208] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [211] "Soltu.DM.S001680"     "Soltu.DM.S001700"     "Soltu.DM.S001700"    
## [214] "Soltu.DM.S001700"     "Soltu.DM.S001700"     "Soltu.DM.S001700"    
## [217] "Soltu.DM.S001700"     "Soltu.DM.S001700"     "Soltu.DM.S001700"    
## [220] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [223] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [226] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [229] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [232] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001750"    
## [235] "Soltu.DM.S001770"     "Soltu.DM.S001810"     "Soltu.DM.S001840"    
## [238] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [241] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [244] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [247] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [250] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [253] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [256] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [259] "Soltu.DM.S001840"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [262] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [265] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [268] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [271] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [274] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [277] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [280] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [283] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [286] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [289] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [292] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [295] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [298] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [301] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [304] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001870"    
## [307] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [310] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [313] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [316] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [319] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [322] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [325] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [328] "Soltu.DM.S001870"     "Soltu.DM.S001970"     "Soltu.DM.S001970"    
## [331] "Soltu.DM.S001970"     "Soltu.DM.S001970"     "Soltu.DM.S002030"    
## [334] "Soltu.DM.S002030"     "Soltu.DM.S002050"     "Soltu.DM.S002050"    
## [337] "Soltu.DM.S002090"     "Soltu.DM.S002100"     "Soltu.DM.S002100"    
## [340] "Soltu.DM.S002100"     "Soltu.DM.S002140"     "Soltu.DM.S002160"    
## [343] "Soltu.DM.S002160"     "Soltu.DM.S002160"     "Soltu.DM.S002160"    
## [346] "Soltu.DM.S002180"     "Soltu.DM.S002180"     "Soltu.DM.S002200"    
## [349] "Soltu.DM.S002200"     "Soltu.DM.S002200"     "Soltu.DM.S002200"    
## [352] "Soltu.DM.S002210"     "Soltu.DM.S002210"     "Soltu.DM.S002210"    
## [355] "Soltu.DM.S002210"     "Soltu.DM.S002220"     "Soltu.DM.S002220"    
## [358] "Soltu.DM.S002220"     "Soltu.DM.S002220"     "Soltu.DM.S002230"    
## [361] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [364] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [367] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [370] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [373] "Soltu.DM.S002230"     "Soltu.DM.S002260"     "Soltu.DM.S002260"    
## [376] "Soltu.DM.S002260"     "Soltu.DM.S002260"     "Soltu.DM.S002260"    
## [379] "Soltu.DM.S002260"     "Soltu.DM.S002330"     "Soltu.DM.S002330"    
## [382] "Soltu.DM.S002340"     "Soltu.DM.S002350"     "Soltu.DM.S002350"    
## [385] "Soltu.DM.S002350"     "Soltu.DM.S002350"     "Soltu.DM.S002350"    
## [388] "Soltu.DM.S002350"     "Soltu.DM.S002350"     "Soltu.DM.S002350"    
## [391] "Soltu.DM.S002350"     "Soltu.DM.S002420"     "Soltu.DM.S002420"    
## [394] "Soltu.DM.S002440"     "Soltu.DM.S002460"     "Soltu.DM.S002490"    
## [397] "Soltu.DM.S002490"     "Soltu.DM.S002490"     "Soltu.DM.S002490"    
## [400] "Soltu.DM.S002490"     "Soltu.DM.S002490"     "Soltu.DM.S002490"    
## [403] "Soltu.DM.S002510"     "Soltu.DM.S002640"     "Soltu.DM.S002640"    
## [406] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [409] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [412] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [415] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [418] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [421] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [424] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [427] "Soltu.DM.S002650"     "Soltu.DM.S002650"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   7069392 377.6   13748575  734.3  13748575  734.3
## Vcells 119359765 910.7  188153622 1435.5 188084823 1435.0

3.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 28662

3.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 23082 
##      # stu unigue genes: 30489 
##      # ath highest connection degree:  88 
##      # stu highest connection degree:  59 
##      # genes in ath with degree > 30:  473 
##      # genes in stu with degree > 30:  310
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          59808          28662          17368           9940
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       30613      17436    15279           7945
##   2       10389       4009     1554           1200
##   3        7039       2383      355            402
##   4        5503       1954      123            230
##   5        4388       1911       51            120
##   6        1876        969        6             43
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
## 115778
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

3.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          24097           8206           2176           1652
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1706       1279     1179            271
##   2        6389       1177      573            737
##   3        5119       1239      255            271
##   4        4790       1723      113            214
##   5        4217       1819       50            116
##   6        1876        969        6             43
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 18684 
##      # stu unigue genes: 22686 
##      # ath highest connection degree:  52 
##      # stu highest connection degree:  45 
##      # genes in ath with degree > 30:  2 
##      # genes in stu with degree > 30:  2
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 17547
## 2:                     compara  6740
## 3:                       PLAZA  6084
## 4: OrthoDB_FastOMA_RBH_MapMan4  1491
## 5:     FastOMA_OrthoDB_MapMan4  2503
## 6:         OrthoDB_RBH_MapMan4   828
## 7:         FastOMA_RBH_MapMan4   938
## 8:                      reject 79647

3.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                         204                          98 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          58                         859 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          84                          35 
##                       PLAZA                      reject 
##                         311                        2944
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'sly'
  , plantNameOut = "tomato"
  , plantDirOut = file.path('..', 'reports', group, "tomato")
  , flag = 1
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000021c10991738>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-39] | |…… | 11% | |…….. | 15% [unnamed-chunk-40] | |……… | 19% | |……….. | 22% [unnamed-chunk-41] | |…………. | 26% | |…………… | 30% [unnamed-chunk-42] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-43] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-44] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-45] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-46] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-47] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-48] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-49] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-50] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-51] | |……………………………………………| 100%

cat(child_content)

4 Subsection: sly

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

4.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  14733 54079 17647 42001 22210 26629
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT1G06160   ath        PRAM_106208    sly      FastOMA
##  2 AT2G31230   ath        PRAM_106208    sly      FastOMA
##  3 AT3G63390   ath        Solyc12T002847 sly      FastOMA
##  4 AT1G55320   ath        Solyc12T002849 sly      FastOMA
##  5 AT1G01180   ath        Solyc01T003002 sly      MCScanX
##  6 AT1G01190   ath        Solyc01T002993 sly      MCScanX
##  7 ATMG01110   ath        Solyc11T001808 sly      MCScanX
##  8 ATMG01330   ath        Solyc11T001788 sly      MCScanX
##  9 AT2G19940   ath        Solyc01T004061 sly      OrthoDB
## 10 AT3G04650   ath        Solyc01T003078 sly      OrthoDB
## 11 AT2G07633   ath        Solyc11T001073 sly      OrthoDB
## 12 AT2G07633   ath        Solyc11T001072 sly      OrthoDB
## 13 AT5G07610   ath        Solyc05T002277 sly      PLAZA  
## 14 AT4G04330   ath        Solyc02T001764 sly      PLAZA  
## 15 AT3G18160   ath        Solyc12T001775 sly      PLAZA  
## 16 AT3G01510   ath        Solyc12T001230 sly      PLAZA  
## 17 AT1G01030   ath        Solyc04T000117 sly      RBH    
## 18 AT1G01030   ath        Solyc08T000375 sly      RBH    
## 19 ATMG01360   ath        Solyc11T001913 sly      RBH    
## 20 ATMG01410   ath        Solyc11T001822 sly      RBH    
## 21 AT3G20015   ath        Solyc04T000286 sly      compara
## 22 AT5G66990   ath        Solyc02T000133 sly      compara
## 23 AT1G69770   ath        Solyc12T002848 sly      compara
## 24 AT1G55350   ath        Solyc12T002850 sly      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4329052 231.2   10998861  587.5  13748576  734.3
## Vcells 95202906 726.4  188153622 1435.5 188136477 1435.4

4.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22707
length(unique(dt$to_geneID))
## [1] 23599
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   14733   54079   17647   42001   22210   26629
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

4.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

4.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

4.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE  TRUE 
## 83343   980
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "PRAM_106208"      "PRAM_106208"      "PRAM_106208"     
##   [4] "PRAM_106208.1"    "PRAM_106208.1"    "PRAM_106208.1"   
##   [7] "PRAM_106208.1"    "PRAM_106208.1"    "PRAM_106382.1"   
##  [10] "PRAM_116774"      "PRAM_116774"      "PRAM_121333"     
##  [13] "PRAM_121333"      "PRAM_121333"      "PRAM_121333"     
##  [16] "PRAM_12783"       "PRAM_12783"       "PRAM_12783"      
##  [19] "PRAM_12783"       "PRAM_128172"      "PRAM_128172"     
##  [22] "PRAM_128172"      "PRAM_128172"      "PRAM_128172"     
##  [25] "PRAM_139928.1"    "PRAM_139928.1"    "PRAM_139928.1.p1"
##  [28] "PRAM_139928.1.p1" "PRAM_140960.1.p1" "PRAM_141672.1.p1"
##  [31] "PRAM_141691"      "PRAM_141691"      "PRAM_141691"     
##  [34] "PRAM_141691"      "PRAM_141691"      "PRAM_141691"     
##  [37] "PRAM_141691.1"    "PRAM_141691.1"    "PRAM_141691.1"   
##  [40] "PRAM_141691.1"    "PRAM_141691.1"    "PRAM_141691.1.p1"
##  [43] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
##  [46] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
##  [49] "PRAM_141869"      "PRAM_141869"      "PRAM_141869.1"   
##  [52] "PRAM_141869.1"    "PRAM_141870"      "PRAM_141870"     
##  [55] "PRAM_141870"      "PRAM_141870"      "PRAM_141870"     
##  [58] "PRAM_141870"      "PRAM_141870"      "PRAM_141870"     
##  [61] "PRAM_141870"      "PRAM_141870"      "PRAM_141870"     
##  [64] "PRAM_141870.1"    "PRAM_141870.1"    "PRAM_141870.1"   
##  [67] "PRAM_141870.1"    "PRAM_141870.1"    "PRAM_141870.1.p1"
##  [70] "PRAM_141870.1.p1" "PRAM_141870.1.p1" "PRAM_141870.1.p1"
##  [73] "PRAM_141870.1.p1" "PRAM_141872"      "PRAM_141872.1"   
##  [76] "PRAM_141872.1.p1" "PRAM_141875.1"    "PRAM_141875.1"   
##  [79] "PRAM_141875.1"    "PRAM_141875.1"    "PRAM_141875.1"   
##  [82] "PRAM_141875.1"    "PRAM_141875.1"    "PRAM_141876.1"   
##  [85] "PRAM_141876.1"    "PRAM_141877.1"    "PRAM_141877.1.p2"
##  [88] "PRAM_15610"       "PRAM_15610.1"     "PRAM_15610.1.p1" 
##  [91] "PRAM_15610.1.p1"  "PRAM_15612.1"     "PRAM_15612.1.p1" 
##  [94] "PRAM_15614"       "PRAM_15614.1.p1"  "PRAM_15617"      
##  [97] "PRAM_15617.1"     "PRAM_15617.1.p1"  "PRAM_15671"      
## [100] "PRAM_15671.1"     "PRAM_15671.1.p3"  "PRAM_15687.1.p1" 
## [103] "PRAM_15687.1.p1"  "PRAM_15687.1.p1"  "PRAM_15708.1.p1" 
## [106] "PRAM_15708.1.p1"  "PRAM_15708.1.p1"  "PRAM_15758"      
## [109] "PRAM_15758.1"     "PRAM_15758.1"     "PRAM_15758.1"    
## [112] "PRAM_15758.1"     "PRAM_15758.1"     "PRAM_15758.1.p1" 
## [115] "PRAM_15758.1.p1"  "PRAM_15763"       "PRAM_15763.1"    
## [118] "PRAM_15773"       "PRAM_15773"       "PRAM_15773.1"    
## [121] "PRAM_15773.1.p1"  "PRAM_15781.1"     "PRAM_15781.1"    
## [124] "PRAM_15781.1"     "PRAM_15781.1"     "PRAM_15781.1"    
## [127] "PRAM_15781.1"     "PRAM_15781.1.p1"  "PRAM_15821.1"    
## [130] "PRAM_15848"       "PRAM_15848"       "PRAM_15848"      
## [133] "PRAM_15848"       "PRAM_15848.1"     "PRAM_15848.1"    
## [136] "PRAM_15848.1"     "PRAM_15848.1"     "PRAM_15848.1"    
## [139] "PRAM_15848.1"     "PRAM_15848.1"     "PRAM_15848.1"    
## [142] "PRAM_15848.1.p1"  "PRAM_15848.1.p1"  "PRAM_15856"      
## [145] "PRAM_15856"       "PRAM_15856"       "PRAM_15856"      
## [148] "PRAM_15856"       "PRAM_15856"       "PRAM_15856"      
## [151] "PRAM_15856"       "PRAM_15856"       "PRAM_15856"      
## [154] "PRAM_15856"       "PRAM_15856"       "PRAM_15856.1"    
## [157] "PRAM_15856.1"     "PRAM_15856.1"     "PRAM_15856.1"    
## [160] "PRAM_15856.1"     "PRAM_15856.1"     "PRAM_15856.1.p1" 
## [163] "PRAM_16203"       "PRAM_16203"       "PRAM_16203"      
## [166] "PRAM_16203"       "PRAM_21846"       "PRAM_21846.1"    
## [169] "PRAM_21846.1"     "PRAM_21846.1"     "PRAM_21846.1"    
## [172] "PRAM_21846.1.p1"  "PRAM_21846.1.p1"  "PRAM_21846.1.p1" 
## [175] "PRAM_21846.1.p1"  "PRAM_21846.1.p1"  "PRAM_21846.1.p1" 
## [178] "PRAM_21934.1"     "PRAM_21934.1"     "PRAM_21934.1.p1" 
## [181] "PRAM_21934.1.p1"  "PRAM_22800"       "PRAM_22800"      
## [184] "PRAM_22800"       "PRAM_22800"       "PRAM_22800"      
## [187] "PRAM_22800"       "PRAM_22800"       "PRAM_22800.1"    
## [190] "PRAM_22800.1"     "PRAM_22800.1.p1"  "PRAM_22800.1.p1" 
## [193] "PRAM_23091"       "PRAM_23722"       "PRAM_23722"      
## [196] "PRAM_23722"       "PRAM_23722"       "PRAM_23722"      
## [199] "PRAM_23722"       "PRAM_23722"       "PRAM_23722"      
## [202] "PRAM_23722"       "PRAM_23722.1"     "PRAM_23722.1"    
## [205] "PRAM_23722.1"     "PRAM_23722.1"     "PRAM_23722.1.p1" 
## [208] "PRAM_23722.1.p1"  "PRAM_23722.1.p1"  "PRAM_23722.1.p1" 
## [211] "PRAM_24043.1"     "PRAM_24043.1"     "PRAM_24043.1"    
## [214] "PRAM_24077"       "PRAM_24077"       "PRAM_24077"      
## [217] "PRAM_24077.1"     "PRAM_24077.1"     "PRAM_24077.1"    
## [220] "PRAM_24077.1.p1"  "PRAM_24077.1.p1"  "PRAM_24077.1.p1" 
## [223] "PRAM_24440.1.p1"  "PRAM_24440.1.p1"  "PRAM_24440.1.p1" 
## [226] "PRAM_24440.1.p1"  "PRAM_24440.1.p1"  "PRAM_25343"      
## [229] "PRAM_25343.1"     "PRAM_25343.1"     "PRAM_25343.1.p1" 
## [232] "PRAM_25343.1.p1"  "PRAM_25466"       "PRAM_25466"      
## [235] "PRAM_25466"       "PRAM_25466"       "PRAM_25466.1"    
## [238] "PRAM_25466.1"     "PRAM_25466.1.p1"  "PRAM_25466.1.p1" 
## [241] "PRAM_25482"       "PRAM_25482"       "PRAM_25482"      
## [244] "PRAM_25482"       "PRAM_25482"       "PRAM_25482"      
## [247] "PRAM_25482"       "PRAM_25601"       "PRAM_25601"      
## [250] "PRAM_25601"       "PRAM_25601"       "PRAM_25601"      
## [253] "PRAM_25601"       "PRAM_25601"       "PRAM_25601"      
## [256] "PRAM_25601.1"     "PRAM_25601.1"     "PRAM_25601.1.p1" 
## [259] "PRAM_25757"       "PRAM_25757.1"     "PRAM_25757.1.p1" 
## [262] "PRAM_25806.1"     "PRAM_25806.1"     "PRAM_25806.1"    
## [265] "PRAM_25806.1.p1"  "PRAM_25806.1.p1"  "PRAM_25806.1.p1" 
## [268] "PRAM_25806.1.p1"  "PRAM_25806.1.p1"  "PRAM_25847"      
## [271] "PRAM_25847.1"     "PRAM_25847.1.p1"  "PRAM_25853.1"    
## [274] "PRAM_25853.1"     "PRAM_25853.1"     "PRAM_25853.1.p1" 
## [277] "PRAM_25853.1.p1"  "PRAM_25853.1.p1"  "PRAM_25853.1.p1" 
## [280] "PRAM_25855.1"     "PRAM_25855.1.p1"  "PRAM_25858"      
## [283] "PRAM_25858"       "PRAM_25858.1"     "PRAM_25858.1"    
## [286] "PRAM_25858.1.p1"  "PRAM_25858.1.p1"  "PRAM_25864"      
## [289] "PRAM_25864"       "PRAM_25864.1"     "PRAM_25864.1"    
## [292] "PRAM_25864.1.p1"  "PRAM_25864.1.p1"  "PRAM_25866"      
## [295] "PRAM_25866"       "PRAM_25866.1"     "PRAM_25866.1"    
## [298] "PRAM_25866.1.p1"  "PRAM_25866.1.p1"  "PRAM_25885"      
## [301] "PRAM_25885"       "PRAM_25885.1"     "PRAM_25885.1"    
## [304] "PRAM_25885.1.p1"  "PRAM_25885.1.p1"  "PRAM_25896.1"    
## [307] "PRAM_25896.1"     "PRAM_25896.1"     "PRAM_25896.1"    
## [310] "PRAM_25896.1"     "PRAM_25896.1"     "PRAM_25900"      
## [313] "PRAM_25900.1"     "PRAM_25900.1.p1"  "PRAM_25902"      
## [316] "PRAM_25902.1"     "PRAM_25902.1.p1"  "PRAM_25906.1"    
## [319] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [322] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [325] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [328] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [331] "PRAM_25911"       "PRAM_25911.1"     "PRAM_25911.1"    
## [334] "PRAM_25911.1.p1"  "PRAM_25921"       "PRAM_25921"      
## [337] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [340] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [343] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [346] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [349] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [352] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [355] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [358] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [361] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [364] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [367] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [370] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [373] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [376] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [379] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [382] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [385] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [388] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [391] "PRAM_25921"       "PRAM_25921"       "PRAM_25921.1"    
## [394] "PRAM_25921.1"     "PRAM_25921.1.p1"  "PRAM_25921.1.p1" 
## [397] "PRAM_25923.1"     "PRAM_25923.1"     "PRAM_25923.1.p1" 
## [400] "PRAM_25923.1.p1"  "PRAM_25927"       "PRAM_25927.1"    
## [403] "PRAM_25927.1.p1"  "PRAM_25933"       "PRAM_25933.1"    
## [406] "PRAM_25933.1.p1"  "PRAM_25935"       "PRAM_25935.1"    
## [409] "PRAM_25935.1.p1"  "PRAM_25937"       "PRAM_25937"      
## [412] "PRAM_25937"       "PRAM_25937"       "PRAM_25937"      
## [415] "PRAM_25937"       "PRAM_25937"       "PRAM_25937"      
## [418] "PRAM_25937.1.p1"  "PRAM_25937.1.p1"  "PRAM_25937.1.p1" 
## [421] "PRAM_25937.1.p1"  "PRAM_25939"       "PRAM_25939.1"    
## [424] "PRAM_25939.1.p1"  "PRAM_25945"       "PRAM_25945.1"    
## [427] "PRAM_25945.1.p1"  "PRAM_25951"       "PRAM_25951"      
## [430] "PRAM_25951"       "PRAM_25951.1"     "PRAM_25951.1"    
## [433] "PRAM_25951.1"     "PRAM_25951.1.p1"  "PRAM_25951.1.p1" 
## [436] "PRAM_25951.1.p1"  "PRAM_25994"       "PRAM_25994.1"    
## [439] "PRAM_25994.1.p1"  "PRAM_25996"       "PRAM_25996"      
## [442] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [445] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [448] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [451] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [454] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [457] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [460] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [463] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [466] "PRAM_25996"       "PRAM_25996.1"     "PRAM_25996.1"    
## [469] "PRAM_25996.1"     "PRAM_25996.1"     "PRAM_25996.1"    
## [472] "PRAM_25996.1"     "PRAM_25996.1"     "PRAM_25996.1"    
## [475] "PRAM_26005.1"     "PRAM_26005.1"     "PRAM_26007"      
## [478] "PRAM_26007.1"     "PRAM_26007.1.p1"  "PRAM_26016"      
## [481] "PRAM_26016.1"     "PRAM_26016.1.p1"  "PRAM_26019"      
## [484] "PRAM_26019"       "PRAM_26019"       "PRAM_26019"      
## [487] "PRAM_26019"       "PRAM_26019.1"     "PRAM_26019.1.p1" 
## [490] "PRAM_26020"       "PRAM_26020.1"     "PRAM_26020.1.p1" 
## [493] "PRAM_26068"       "PRAM_26068.1"     "PRAM_26068.1.p1" 
## [496] "PRAM_26082.1"     "PRAM_26082.1"     "PRAM_26082.1.p1" 
## [499] "PRAM_26082.1.p1"  "PRAM_26097"       "PRAM_26097"      
## [502] "PRAM_26097.1"     "PRAM_26097.1.p1"  "PRAM_26097.1.p1" 
## [505] "PRAM_26101.1"     "PRAM_26105"       "PRAM_26105.1"    
## [508] "PRAM_26105.1.p1"  "PRAM_26108"       "PRAM_26108"      
## [511] "PRAM_26108"       "PRAM_26108.1"     "PRAM_26108.1.p1" 
## [514] "PRAM_26108.1.p1"  "PRAM_26108.1.p1"  "PRAM_26108.1.p1" 
## [517] "PRAM_26109"       "PRAM_26109.1"     "PRAM_26109.1"    
## [520] "PRAM_26109.1.p1"  "PRAM_26109.1.p1"  "PRAM_26109.1.p1" 
## [523] "PRAM_26113"       "PRAM_26113"       "PRAM_26113"      
## [526] "PRAM_26113"       "PRAM_26113"       "PRAM_26113"      
## [529] "PRAM_26113"       "PRAM_26113"       "PRAM_26113.1"    
## [532] "PRAM_26113.1"     "PRAM_26113.1"     "PRAM_26113.1"    
## [535] "PRAM_26113.1"     "PRAM_26128"       "PRAM_26128.1"    
## [538] "PRAM_26128.1.p1"  "PRAM_26129"       "PRAM_26129.1"    
## [541] "PRAM_26129.1"     "PRAM_26129.1.p1"  "PRAM_26132"      
## [544] "PRAM_26132"       "PRAM_26132.1"     "PRAM_26132.1"    
## [547] "PRAM_26132.1.p1"  "PRAM_26132.1.p1"  "PRAM_26139"      
## [550] "PRAM_26139"       "PRAM_26139"       "PRAM_26139.1"    
## [553] "PRAM_26139.1"     "PRAM_26139.1"     "PRAM_26139.1"    
## [556] "PRAM_26139.1.p1"  "PRAM_26139.1.p1"  "PRAM_26139.1.p1" 
## [559] "PRAM_26139.1.p1"  "PRAM_26139.1.p1"  "PRAM_26145"      
## [562] "PRAM_26145"       "PRAM_26145.1"     "PRAM_26145.1"    
## [565] "PRAM_26145.1.p1"  "PRAM_26184"       "PRAM_26184.1"    
## [568] "PRAM_26184.1.p1"  "PRAM_26189"       "PRAM_26189"      
## [571] "PRAM_26189"       "PRAM_26189"       "PRAM_26189.1"    
## [574] "PRAM_26189.1"     "PRAM_26189.1"     "PRAM_26189.1"    
## [577] "PRAM_26189.1"     "PRAM_26189.1.p1"  "PRAM_26195.1"    
## [580] "PRAM_26195.1"     "PRAM_26195.1.p1"  "PRAM_26195.1.p1" 
## [583] "PRAM_26207"       "PRAM_26207"       "PRAM_26207.1"    
## [586] "PRAM_26207.1"     "PRAM_26207.1"     "PRAM_26207.1"    
## [589] "PRAM_26207.1.p1"  "PRAM_26211"       "PRAM_26211.1"    
## [592] "PRAM_26211.1.p1"  "PRAM_26212"       "PRAM_26212.1"    
## [595] "PRAM_26212.1.p1"  "PRAM_26216"       "PRAM_26216"      
## [598] "PRAM_26216"       "PRAM_26216"       "PRAM_26216.1"    
## [601] "PRAM_26216.1"     "PRAM_26216.1"     "PRAM_26216.1"    
## [604] "PRAM_26216.1.p1"  "PRAM_26216.1.p1"  "PRAM_26216.1.p1" 
## [607] "PRAM_26216.1.p1"  "PRAM_26218"       "PRAM_26218.1"    
## [610] "PRAM_26218.1.p1"  "PRAM_26220"       "PRAM_26220.1"    
## [613] "PRAM_26220.1.p1"  "PRAM_26242"       "PRAM_26242"      
## [616] "PRAM_26242.1"     "PRAM_26242.1"     "PRAM_26242.1"    
## [619] "PRAM_26242.1.p1"  "PRAM_26242.1.p1"  "PRAM_26243.1"    
## [622] "PRAM_26243.1.p1"  "PRAM_26245"       "PRAM_26245"      
## [625] "PRAM_26245"       "PRAM_26245"       "PRAM_26245.1"    
## [628] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [631] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [634] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [637] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [640] "PRAM_26245.1"     "PRAM_26247"       "PRAM_26247.1"    
## [643] "PRAM_26247.1.p1"  "PRAM_26248"       "PRAM_26248"      
## [646] "PRAM_26248"       "PRAM_26248"       "PRAM_26248.1"    
## [649] "PRAM_26248.1"     "PRAM_26248.1"     "PRAM_26248.1"    
## [652] "PRAM_26248.1.p1"  "PRAM_26248.1.p1"  "PRAM_26248.1.p1" 
## [655] "PRAM_26248.1.p1"  "PRAM_26252"       "PRAM_26252.1"    
## [658] "PRAM_26252.1.p1"  "PRAM_26255"       "PRAM_26255.1"    
## [661] "PRAM_26255.1"     "PRAM_26255.1"     "PRAM_26255.1"    
## [664] "PRAM_26255.1"     "PRAM_26255.1.p1"  "PRAM_26257"      
## [667] "PRAM_26257.1"     "PRAM_26257.1"     "PRAM_26257.1"    
## [670] "PRAM_26257.1.p1"  "PRAM_26262"       "PRAM_26262"      
## [673] "PRAM_26262.1"     "PRAM_26262.1"     "PRAM_26262.1"    
## [676] "PRAM_26262.1.p1"  "PRAM_26262.1.p1"  "PRAM_26263.1"    
## [679] "PRAM_26263.1"     "PRAM_26263.1"     "PRAM_26263.1.p1" 
## [682] "PRAM_26263.1.p1"  "PRAM_26268.1"     "PRAM_26268.1"    
## [685] "PRAM_26268.1"     "PRAM_26273"       "PRAM_26273"      
## [688] "PRAM_26273"       "PRAM_26273"       "PRAM_26273.1"    
## [691] "PRAM_26273.1"     "PRAM_26273.1"     "PRAM_26273.1"    
## [694] "PRAM_26276"       "PRAM_26276.1"     "PRAM_26276.1.p1" 
## [697] "PRAM_26279"       "PRAM_26279"       "PRAM_26279.1"    
## [700] "PRAM_26279.1"     "PRAM_26279.1"     "PRAM_26279.1.p1" 
## [703] "PRAM_26279.1.p1"  "PRAM_26279.1.p1"  "PRAM_26280"      
## [706] "PRAM_26280"       "PRAM_26280.1"     "PRAM_26280.1"    
## [709] "PRAM_26280.1.p1"  "PRAM_26280.1.p1"  "PRAM_26281"      
## [712] "PRAM_26281"       "PRAM_26281.1"     "PRAM_26281.1"    
## [715] "PRAM_26281.1"     "PRAM_26281.1"     "PRAM_26281.1"    
## [718] "PRAM_26281.1.p1"  "PRAM_26281.1.p1"  "PRAM_26296"      
## [721] "PRAM_26296.1"     "PRAM_26296.1.p2"  "PRAM_26298"      
## [724] "PRAM_26298.1"     "PRAM_26301"       "PRAM_26301.1"    
## [727] "PRAM_26301.1.p1"  "PRAM_26302"       "PRAM_26302.1"    
## [730] "PRAM_26302.1.p1"  "PRAM_26307"       "PRAM_26307"      
## [733] "PRAM_26307"       "PRAM_26307.1"     "PRAM_26307.1"    
## [736] "PRAM_26307.1.p1"  "PRAM_26312"       "PRAM_26312.1"    
## [739] "PRAM_26312.1"     "PRAM_26312.1.p1"  "PRAM_26314"      
## [742] "PRAM_26314"       "PRAM_26314.1"     "PRAM_26314.1"    
## [745] "PRAM_26314.1.p1"  "PRAM_26314.1.p1"  "PRAM_26322"      
## [748] "PRAM_26322.1"     "PRAM_26322.1.p1"  "PRAM_26322.1.p1" 
## [751] "PRAM_26322.1.p1"  "PRAM_26326"       "PRAM_26326.1"    
## [754] "PRAM_26326.1.p1"  "PRAM_26326.1.p1"  "PRAM_26326.1.p1" 
## [757] "PRAM_26328"       "PRAM_26328"       "PRAM_26328.1"    
## [760] "PRAM_26328.1"     "PRAM_26331"       "PRAM_26331.1"    
## [763] "PRAM_26331.1"     "PRAM_26334"       "PRAM_26334.1"    
## [766] "PRAM_26334.1"     "PRAM_26363"       "PRAM_26363.1"    
## [769] "PRAM_26363.1.p3"  "PRAM_267"         "PRAM_267"        
## [772] "PRAM_267.1"       "PRAM_267.1"       "PRAM_267.1"      
## [775] "PRAM_267.1"       "PRAM_267.1.p1"    "PRAM_267.1.p1"   
## [778] "PRAM_272"         "PRAM_272.1"       "PRAM_272.1.p1"   
## [781] "PRAM_274.1"       "PRAM_281"         "PRAM_281.1"      
## [784] "PRAM_283"         "PRAM_283.1"       "PRAM_36897"      
## [787] "PRAM_36897.1"     "PRAM_36897.1.p1"  "PRAM_36916"      
## [790] "PRAM_36916"       "PRAM_36916.1"     "PRAM_36916.1"    
## [793] "PRAM_36916.1.p1"  "PRAM_36916.1.p1"  "PRAM_36921"      
## [796] "PRAM_36921.1"     "PRAM_36921.1"     "PRAM_36921.1.p1" 
## [799] "PRAM_36926"       "PRAM_36926"       "PRAM_36926"      
## [802] "PRAM_36926"       "PRAM_36926"       "PRAM_36926.1"    
## [805] "PRAM_36926.1"     "PRAM_36926.1"     "PRAM_36926.1"    
## [808] "PRAM_36938"       "PRAM_36938.1"     "PRAM_36938.1"    
## [811] "PRAM_36938.1"     "PRAM_36938.1.p1"  "PRAM_36938.1.p1" 
## [814] "PRAM_36938.1.p1"  "PRAM_36967"       "PRAM_36967"      
## [817] "PRAM_36967"       "PRAM_36967"       "PRAM_36967.1"    
## [820] "PRAM_36967.1"     "PRAM_36967.1"     "PRAM_36967.1"    
## [823] "PRAM_36967.1.p2"  "PRAM_36976"       "PRAM_36976"      
## [826] "PRAM_36976"       "PRAM_36976.1"     "PRAM_36976.1"    
## [829] "PRAM_36976.1"     "PRAM_36976.1"     "PRAM_36976.1"    
## [832] "PRAM_36976.1"     "PRAM_36976.1.p1"  "PRAM_36976.1.p1" 
## [835] "PRAM_36976.1.p1"  "PRAM_36977.1"     "PRAM_36977.1"    
## [838] "PRAM_36977.1"     "PRAM_36977.1"     "PRAM_36977.1"    
## [841] "PRAM_36997"       "PRAM_36997"       "PRAM_36997"      
## [844] "PRAM_36997.1"     "PRAM_36997.1"     "PRAM_36997.1"    
## [847] "PRAM_36997.1.p1"  "PRAM_36997.1.p1"  "PRAM_36997.1.p1" 
## [850] "PRAM_37001"       "PRAM_37001"       "PRAM_37001.1"    
## [853] "PRAM_37001.1"     "PRAM_37001.1.p1"  "PRAM_37001.1.p1" 
## [856] "PRAM_37004"       "PRAM_37004.1"     "PRAM_37004.1.p1" 
## [859] "PRAM_37006"       "PRAM_37006.1"     "PRAM_37006.1.p1" 
## [862] "PRAM_37008"       "PRAM_37008"       "PRAM_37008"      
## [865] "PRAM_37008.1"     "PRAM_37008.1"     "PRAM_37008.1"    
## [868] "PRAM_37008.1.p1"  "PRAM_37011"       "PRAM_37011"      
## [871] "PRAM_37011.1"     "PRAM_37011.1"     "PRAM_37011.1.p1" 
## [874] "PRAM_37011.1.p1"  "PRAM_43769.1"     "PRAM_43769.1"    
## [877] "PRAM_43769.1.p1"  "PRAM_448.1"       "PRAM_448.1"      
## [880] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [883] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [886] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [889] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [892] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [895] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [898] "PRAM_448.1"       "PRAM_46974"       "PRAM_46974"      
## [901] "PRAM_46974"       "PRAM_46974"       "PRAM_46974"      
## [904] "PRAM_46974"       "PRAM_46974.1"     "PRAM_46974.1"    
## [907] "PRAM_46974.1"     "PRAM_46974.1"     "PRAM_46974.1"    
## [910] "PRAM_46974.1"     "PRAM_46974.1"     "PRAM_46974.1"    
## [913] "PRAM_46974.1"     "PRAM_46974.1"     "PRAM_52553.1"    
## [916] "PRAM_52553.1"     "PRAM_52553.1.p1"  "PRAM_52553.1.p1" 
## [919] "PRAM_66596.1.p1"  "PRAM_66596.1.p1"  "PRAM_66596.1.p1" 
## [922] "PRAM_67540.1"     "PRAM_67549"       "PRAM_67549.1"    
## [925] "PRAM_67549.1"     "PRAM_67549.1.p1"  "PRAM_67549.1.p1" 
## [928] "PRAM_73094"       "PRAM_73094"       "PRAM_73094"      
## [931] "PRAM_73094"       "PRAM_73094"       "PRAM_73094"      
## [934] "PRAM_73094"       "PRAM_758"         "PRAM_758.1"      
## [937] "PRAM_758.1.p1"    "PRAM_80941"       "PRAM_80941"      
## [940] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [943] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [946] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [949] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [952] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [955] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [958] "PRAM_80941.1"     "PRAM_80941.1"     "PRAM_80941.1.p1" 
## [961] "PRAM_80945"       "PRAM_80945.1"     "PRAM_80945.1"    
## [964] "PRAM_80945.1.p1"  "PRAM_8262"        "PRAM_8262"       
## [967] "PRAM_8262"        "PRAM_85890"       "PRAM_85890"      
## [970] "PRAM_85890"       "PRAM_85890"       "PRAM_888"        
## [973] "PRAM_888"         "PRAM_888"         "PRAM_888"        
## [976] "PRAM_888"         "PRAM_888"         "PRAM_888.1"      
## [979] "PRAM_891.1"       "PRAM_95186.1.p1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6052737 323.3   10998861  587.5  13748576  734.3
## Vcells 100323228 765.5  188153622 1435.5 188136477 1435.4

4.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 18219

4.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22707 
##      # sly unigue genes: 23599 
##      # ath highest connection degree:  55 
##      # sly highest connection degree:  58 
##      # genes in ath with degree > 30:  204 
##      # genes in sly with degree > 30:  197
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          46257          18219          13068           6779
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       21434       9378    11042           5277
##   2        7504       2496     1238            874
##   3        5370       1714      437            281
##   4        5075       1682      209            159
##   5        4739       1901      108            132
##   6        2135       1048       34             56
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  84323
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

4.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          20050           7582           3266           1187
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1274       1186     1978            216
##   2        3918        962      634            479
##   3        3711       1081      331            179
##   4        4448       1476      187            131
##   5        4564       1829      102            126
##   6        2135       1048       34             56
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 18491 
##      # sly unigue genes: 20258 
##      # ath highest connection degree:  24 
##      # sly highest connection degree:  17 
##      # genes in ath with degree > 30:  0 
##      # genes in sly with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 17647
## 2:                     compara  6150
## 3:                       PLAZA  5638
## 4: OrthoDB_FastOMA_RBH_MapMan4   567
## 5:     FastOMA_OrthoDB_MapMan4  1265
## 6:         OrthoDB_RBH_MapMan4   415
## 7:         FastOMA_RBH_MapMan4   403
## 8:                      reject 52238

4.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                         184                          61 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          37                         789 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          44                          17 
##                       PLAZA                      reject 
##                         206                        2014
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'vvi'
  , plantNameOut = "grapevine"
  , plantDirOut = file.path('..', 'reports', group, "grapevine")
  , flag = 1
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000021c38750a48>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-67] | |…… | 11% | |…….. | 15% [unnamed-chunk-68] | |……… | 19% | |……….. | 22% [unnamed-chunk-69] | |…………. | 26% | |…………… | 30% [unnamed-chunk-70] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-71] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-72] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-73] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-74] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-75] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-76] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-77] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-78] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-79] | |……………………………………………| 100%

cat(child_content)

5 Subsection: vvi

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  13825 61099 18589 49707 18488 26763
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID             to_plant source 
##    <chr>       <chr>      <chr>                 <chr>    <chr>  
##  1 ATCG00380   ath        Vitvi05_01chr00g00010 vvi      FastOMA
##  2 ATCG00780   ath        Vitvi05_01chr00g00040 vvi      FastOMA
##  3 AT4G20320   ath        Vitvi05_01chr19g23100 vvi      FastOMA
##  4 AT1G79580   ath        Vitvi05_01chr19g23110 vvi      FastOMA
##  5 AT1G10350   ath        Vitvi05_01chr01g13730 vvi      MCScanX
##  6 AT1G10370   ath        Vitvi05_01chr01g13610 vvi      MCScanX
##  7 ATMG01190   ath        Vitvi05_01chr10g19400 vvi      MCScanX
##  8 ATMG01280   ath        Vitvi05_01chr10g19500 vvi      MCScanX
##  9 AT1G44800   ath        Vitvi05_01chr18g04300 vvi      OrthoDB
## 10 AT1G21890   ath        Vitvi05_01chr18g04300 vvi      OrthoDB
## 11 AT2G07695   ath        Vitvi05_01chr00g07410 vvi      OrthoDB
## 12 AT2G07695   ath        Vitvi05_01chr10g19710 vvi      OrthoDB
## 13 AT3G20250   ath        Vitvi05_01chr09g20830 vvi      PLAZA  
## 14 AT4G25880   ath        Vitvi05_01chr09g20830 vvi      PLAZA  
## 15 AT3G27027   ath        Vitvi05_01chr17g01390 vvi      PLAZA  
## 16 AT3G01940   ath        Vitvi05_01chr17g01390 vvi      PLAZA  
## 17 AT1G01020   ath        Vitvi05_01chr10g07040 vvi      RBH    
## 18 AT1G01030   ath        Vitvi05_01chr02g03430 vvi      RBH    
## 19 ATMG01410   ath        Vitvi05_01chr00g07980 vvi      RBH    
## 20 ATMG01410   ath        Vitvi05_01chr10g19280 vvi      RBH    
## 21 AT2G07785   ath        Vitvi05_01chr10g19420 vvi      compara
## 22 AT2G07734   ath        Vitvi05_01chr00g07530 vvi      compara
## 23 AT5G47380   ath        Vitvi05_01chr19g16420 vvi      compara
## 24 AT1G79390   ath        Vitvi05_01chr19g22760 vvi      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4131296 220.7   10998861  587.5  13748576  734.3
## Vcells 88158984 672.6  188153622 1435.5 188136477 1435.4

5.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23140
length(unique(dt$to_geneID))
## [1] 24769
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   13825   61099   18589   49707   18488   26763
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

5.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

5.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

5.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 96387
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6424627 343.2   10998861  587.5  13748576  734.3
## Vcells 107951188 823.7  188153622 1435.5 188136789 1435.4

5.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 21558

5.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 23140 
##      # vvi unigue genes: 24769 
##      # ath highest connection degree:  162 
##      # vvi highest connection degree:  115 
##      # genes in ath with degree > 30:  372 
##      # genes in vvi with degree > 30:  239
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          53971          21558          13305           7553
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       29342      11745    11590           6714
##   2        8088       3523     1082            552
##   3        4810       1668      428            127
##   4        4460       1498      109             83
##   5        4562       1839       65             51
##   6        2709       1285       31             26
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  96387
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

5.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          20841           7247           2063            814
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1456        962     1060            211
##   2        4849        912      450            360
##   3        3544        969      362             91
##   4        3924       1352       98             77
##   5        4359       1767       62             49
##   6        2709       1285       31             26
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19002 
##      # vvi unigue genes: 19327 
##      # ath highest connection degree:  73 
##      # vvi highest connection degree:  18 
##      # genes in ath with degree > 30:  4 
##      # genes in vvi with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 18589
## 2:                     compara  4089
## 3:                       PLAZA  4650
## 4: OrthoDB_FastOMA_RBH_MapMan4   744
## 5:     FastOMA_OrthoDB_MapMan4  1802
## 6:         OrthoDB_RBH_MapMan4   706
## 7:         FastOMA_RBH_MapMan4   385
## 8:                      reject 65422

5.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                         125                         108 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          25                         754 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          67                          33 
##                       PLAZA                      reject 
##                         238                        1593
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
# Step 1: params_list
# params_list <- list(
# ...
# )
# 
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
#   plantName1: NULL
#   plantName2: NULL
#   plantName3: NULL
#   plantName4: NULL
#   plantDirIn: NULL
#   plantNameOut: NULL
#   plantDirOut: NULL
#   pattern_in: NULL
#   pattern_out: NULL
#   compara_pattern_in1: NULL
#   compara_pattern_in2: NULL
#   plaza_pattern_in1: NULL
#   plaza_pattern_in2: NULL
#   ref_genome: NULL
#   mercator: NULL
#   mercatorPatternIn1: NULL
#   mercatorPatternOut1: NULL
#   mercatorPatternIn2: NULL
#   mercatorPatternOut2: NULL
# ---
# 
# 
# access params in the script like:
# params$plantName1
# params$plantDirOut
# 
# Step 3: Call render() like
# rmarkdown::render(
#   input = "09_fruitTrees-child.Rmd",
#   params = params_list,
#   envir = new.env(parent = globalenv()),  # optional: isolate execution
#   output_format = "html_document",
#   quiet = FALSE
# )
# 
# 
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory

6 SessionInfo

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## time zone: Europe/Ljubljana
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2      knitr_1.50         data.table_1.17.0 
## [5] magrittr_2.0.3    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.4.1     zip_2.3.2          Rcpp_1.0.14        tidyselect_1.2.1  
##  [9] stringr_1.5.1      dichromat_2.0-0.1  jquerylib_0.1.4    textshaping_1.0.1 
## [13] systemfonts_1.2.3  scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
## [17] R6_2.6.1           labeling_0.4.3     generics_0.1.4     patchwork_1.3.0   
## [21] igraph_2.1.4       openxlsx_4.2.8     tibble_3.2.1       bslib_0.9.0       
## [25] pillar_1.10.2      RColorBrewer_1.1-3 rlang_1.1.5        utf8_1.2.5        
## [29] cachem_1.1.0       stringi_1.8.7      xfun_0.52          sass_0.4.10       
## [33] cli_3.6.3          withr_3.0.2        digest_0.6.37      grid_4.4.1        
## [37] rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.3    
## [41] glue_1.8.0         farver_2.1.2       ragg_1.4.0         colorspace_2.1-1  
## [45] rmarkdown_2.29     tools_4.4.1        pkgconfig_2.0.3    htmltools_0.5.8.1